# Re-import libraries and reload dataset due to kernel reset
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from itertools import permutations
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination
from itertools import permutations
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
import warnings

# Blanket filter: suppresses every warning raised for the rest of the run.
warnings.filterwarnings("ignore")

# Read the cleaned survey export, decoding the CSV as ISO-8859-1.
file_path = r"D:\RENJIN RAJU\MASTERS\SEMESTER 3\THESIS\Research\Analysis\Survey data cleaned.csv"
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Adoption columns analysed by the rest of the script, one per technology.
tech_cols = [
    "adoption_Artificial_Intelligence_AI",
    "adoption_Internet_of_Things_IoT",
    "adoption_Blockchain",
    "adoption_Cloud_Computing",
    "adoption_Robotic_and_Automation",
    "adoption_Big_Data_Analytics",
    "adoption_Simulation",
]

# 1. Correlation Heatmap
# Pairwise correlation between the adoption columns (DataFrame.corr defaults).
tech_corr = df[tech_cols].corr()

# Draw on an explicit figure/axes pair, save to disk, then display.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(tech_corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Between Technology Adoptions")
fig.tight_layout()
fig.savefig("tech_correlation_heatmap_new.png")
plt.show()

# 2. Association Rules for All Tech Co-Adoptions
# Binarise adoption: a technology counts as adopted when its value is > 0.
# A vectorised comparison replaces the deprecated DataFrame.applymap call and
# produces the same boolean frame (NaN compares as False, exactly as before).
df_apriori_all = df[tech_cols] > 0

# Frequent itemsets mined with min_support=0.05, reported by column name.
frequent_items_all = apriori(df_apriori_all, min_support=0.05, use_colnames=True)

# Rules filtered on confidence (threshold 0.4), strongest first, then exported.
rules_all = association_rules(frequent_items_all, metric="confidence", min_threshold=0.4)
rules_all_sorted = rules_all.sort_values(by="confidence", ascending=False)
rules_all_sorted.to_csv("association_rules_output.csv", index=False)

# 3. PSM Between Each Tech Pair
# For every ordered (treatment, outcome) pair of technologies:
#   1. fit a logistic propensity model for adopting the treatment technology,
#   2. match each treated row to the control row with the closest propensity
#      score (1 nearest neighbour; a control row may be reused),
#   3. record the difference in outcome-adoption rate between the treated rows
#      and their matched controls.
psm_results = []
psm_features = ['Disruption_Global_Crises', 'Disruption_Cybersecurity_Threats',
                'barriers_High_Implementation_Costs', 'barriers_Resistance_to_Change']

# The covariates do not depend on the pair, so select them once.
X_psm = df[psm_features]

for outcome_tech in tech_cols:
    for treatment_tech in tech_cols:
        if treatment_tech == outcome_tech:
            continue

        # Treatment indicator: 1 when the treatment technology is adopted (> 0).
        # The helper columns are written onto df, as the original code did.
        df['Treatment'] = (df[treatment_tech] > 0).astype(int)
        y_psm = df['Treatment']

        # LogisticRegression.fit raises ValueError when only one class is
        # present, so a pair lacking either treated or control rows must be
        # skipped *before* fitting. Previously the emptiness check ran only
        # after the fit, i.e. too late to prevent the crash.
        if y_psm.nunique() < 2:
            continue

        propensity_model = LogisticRegression()
        propensity_model.fit(X_psm, y_psm)
        df['propensity_score'] = propensity_model.predict_proba(X_psm)[:, 1]

        treated = df[df['Treatment'] == 1]
        control = df[df['Treatment'] == 0]

        # Both groups are non-empty here, so every treated row gets a match.
        neighbors = NearestNeighbors(n_neighbors=1).fit(control[['propensity_score']])
        _, indices = neighbors.kneighbors(treated[['propensity_score']])
        matched_control = control.iloc[indices.flatten()]

        # Binarise the outcome technology the same way as the treatment.
        treated_outcome = (treated[outcome_tech] > 0).astype(int).reset_index(drop=True)
        control_outcome = (matched_control[outcome_tech] > 0).astype(int).reset_index(drop=True)

        # NOTE(review): only treated rows are matched, so this is strictly an
        # effect estimated on the treated group; the "ATE" key is kept so the
        # exported CSV schema does not change.
        ate = treated_outcome.mean() - control_outcome.mean()
        psm_results.append({
            "Treatment_Tech": treatment_tech,
            "Outcome_Tech": outcome_tech,
            "ATE": round(ate, 3)
        })

# Format output tables
# Naming the columns explicitly keeps the frame sortable even when no pair
# produced a result: an empty list would otherwise give a frame without an
# "ATE" column and sort_values would raise KeyError.
psm_df = pd.DataFrame(
    psm_results, columns=["Treatment_Tech", "Outcome_Tech", "ATE"]
).sort_values(by="ATE", ascending=False)

# First ten association rules in the already-sorted order (key columns only).
rules_top = rules_all_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10)
psm_df.to_csv("psm_tech_pair_results.csv", index=False)



# 4. Bayesian Network Modeling
# Integer 0/1 adoption indicators (adopted when the value is > 0). The
# vectorised comparison replaces the deprecated DataFrame.applymap call and
# yields the same values.
df_bn = (df[tech_cols] > 0).astype(int)

# Hand-specified directed structure as (parent, child) pairs.
edges = [
    ('adoption_Artificial_Intelligence_AI', 'adoption_Cloud_Computing'),
    ('adoption_Artificial_Intelligence_AI', 'adoption_Big_Data_Analytics'),
    ('adoption_Internet_of_Things_IoT', 'adoption_Cloud_Computing'),
    ('adoption_Blockchain', 'adoption_Cloud_Computing'),
    ('adoption_Big_Data_Analytics', 'adoption_Robotic_and_Automation'),
    ('adoption_Simulation', 'adoption_Robotic_and_Automation')
]

# Estimate the CPDs with a Bayesian estimator using a BDeu prior.
model = BayesianNetwork(edges)
model.fit(df_bn, estimator=BayesianEstimator, prior_type="BDeu")

# Dump every CPD table, separated by a blank line. The separator used to be
# "\\n\\n", which wrote the literal characters backslash-n into the file
# instead of real line breaks and ran the tables together.
with open("bayesian_cpds.txt", "w") as f:
    for cpd in model.get_cpds():
        f.write(str(cpd) + "\n\n")

# Graphical Visualization
# Mirror the model's edges in a plain directed graph used for drawing.
G = nx.DiGraph()
for parent, child in model.edges():
    G.add_edge(parent, child)

# Node/label styling collected in one place.
draw_options = dict(
    with_labels=True,
    node_size=3000,
    node_color='skyblue',
    font_size=10,
    font_weight='bold',
    arrows=True,
)

fig = plt.figure(figsize=(12, 9))
nx.draw(G, **draw_options)
plt.title("Bayesian Network: Technology Adoption Influence Structure")
fig.tight_layout()
fig.savefig("bayesian_network_visual.png")
plt.close(fig)

# Conditional Probability Queries
# For every ordered (target, evidence) pair, query the target's distribution
# given evidence=1 and keep index 1 of the result (presumably the "adopted"
# state -- confirm against the model's state ordering). A failing query is
# recorded as an error string rather than aborting the run.
inference = VariableElimination(model)
query_results_all = []
for target, evidence in permutations(tech_cols, 2):
    query_label = f"P({target}=True | {evidence}=True)"
    try:
        posterior = inference.query(variables=[target], evidence={evidence: 1})
        outcome = round(posterior.values[1], 4)
    except Exception as err:
        outcome = f"Error: {str(err)}"
    query_results_all.append({"Query": query_label, "Result": outcome})

# Convert to DataFrame and export
query_all_df = pd.DataFrame(query_results_all)
query_all_df.to_csv("bayesian_query_results.csv", index=False)
# NOTE(review): this message is printed here although later sections of the
# script still write further output files.
print("All outputs saved.")

# Heatmap of conditional probabilities
def extract_pair(q):
    """Split a label of the form ``P(<target>=True | <evidence>=True)``.

    Args:
        q: Query label string, as stored in the ``Query`` column.

    Returns:
        ``(target, evidence)`` with the ``P(``, ``=True`` and closing
        parenthesis decorations removed, or ``(None, None)`` when ``q`` is not
        a string or does not contain exactly one ``|``.
    """
    try:
        lhs, rhs = q.split('|')
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: q is not a str (e.g. None or NaN);
        # ValueError: the split did not yield exactly two parts.
        return None, None
    # The left side reads "P(<target>=True" -- it carries no closing
    # parenthesis, so stripping only "=True)" left "=True" glued to the target
    # name. Remove the bare "=True" as well.
    target = lhs.strip().replace('P(', '').replace('=True)', '').replace('=True', '').strip()
    evidence = rhs.strip().replace('=True)', '').strip()
    return target, evidence
# Nested mapping evidence -> {target: probability}. Results that are not
# numeric (failed queries) are stored as None.
matrix_data = {}
for _, record in query_all_df.iterrows():
    target, evidence = extract_pair(record['Query'])
    if not (target and evidence):
        continue
    value = record['Result']
    if not isinstance(value, (float, int)):
        value = None
    matrix_data.setdefault(evidence, {})[target] = value

# Transpose so that rows are evidence technologies and columns are targets.
heatmap_df = pd.DataFrame(matrix_data).T

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    heatmap_df,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    cbar_kws={'label': 'P(Target=True | Evidence=True)'},
    ax=ax,
)
ax.set_title("Conditional Probability Heatmap (Bayesian Inference)")
ax.set_xlabel("Target Technology")
ax.set_ylabel("Evidence Technology")
fig.tight_layout()
fig.savefig("bayesian_heatmap.png")
plt.close(fig)

# Influence Ranking
# Score each node of G by summing, over its direct successors, index 1 of the
# successor's distribution given node=1 (presumably the "adopted" state --
# confirm against the model's state ordering). Nodes without outgoing edges
# score 0. The unused `out_edges` lookup from the earlier version is dropped.
influence_scores = {}
for node in G.nodes():
    total_influence = sum(
        inference.query(variables=[child], evidence={node: 1}).values[1]
        for child in G.successors(node)
    )
    influence_scores[node] = round(total_influence, 3)

# Highest total influence first, then export.
influence_df = pd.DataFrame(list(influence_scores.items()), columns=["Technology", "Total_Influence"])
influence_df = influence_df.sort_values(by="Total_Influence", ascending=False)
influence_df.to_csv("bayesian_influence_ranking.csv", index=False)

# Agglomerative Clustering
# Each row of the heatmap frame is one technology's profile; missing
# probabilities are replaced by 0 before clustering.
heatmap_filled = heatmap_df.fillna(0)
profile_matrix = heatmap_filled.values

# Pairwise Euclidean distances in condensed and square form. Neither is passed
# to the clustering below, which works on the raw profiles.
dist_matrix = pdist(profile_matrix, metric='euclidean')
dist_square = squareform(dist_matrix)

# Three clusters with the estimator's default settings.
agglo = AgglomerativeClustering(n_clusters=3)
agglo_labels = agglo.fit_predict(profile_matrix)

cluster_df = pd.DataFrame(
    {
        "Technology": heatmap_filled.index,
        "Agglomerative_Cluster": agglo_labels,
    }
)
cluster_df.to_csv("tech_clusters_agglomerative.csv", index=False)

# KMeans Clustering & PCA
# Cluster on the full profiles; PCA is used only to get 2-D plot coordinates.
X = heatmap_filled.values
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(X)

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap="Set2", s=200)
# Annotate every point with its technology name, minus the column prefix.
for (x_coord, y_coord), name in zip(X_pca, heatmap_filled.index):
    ax.text(x_coord, y_coord, name.replace("adoption_", ""), fontsize=9)
ax.set_title("KMeans Clustering of Technologies Based on Bayesian Influence")
fig.tight_layout()
fig.savefig("kmeans_cluster_tech_influence.png")
plt.close(fig)

# Rebuild G as a weighted digraph: one edge evidence -> target for every
# non-null conditional probability in the heatmap frame. This rebinds G, which
# previously held the model's structure graph.
G = nx.DiGraph()
for evidence in heatmap_df.index:
    for target in heatmap_df.columns:
        prob = heatmap_df.at[evidence, target]
        if pd.notnull(prob):
            G.add_edge(evidence, target, weight=prob)

plt.figure(figsize=(12, 10))
pos = nx.spring_layout(G, k=0.7, iterations=50)

# Edge width scales with the probability; edge labels show it to 2 decimals.
# The earlier string-keyed `labels` dict was never used and overwrote the
# KMeans `labels` array, so it has been removed.
weights = [weight * 5 for _, _, weight in G.edges(data='weight')]
edge_labels = {(u, v): f"{weight:.2f}" for u, v, weight in G.edges(data='weight')}

nx.draw_networkx_nodes(G, pos, node_size=3000, node_color='lightblue')
nx.draw_networkx_edges(G, pos, width=weights, arrows=True, edge_color='gray')
nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='darkred', font_size=8)

plt.title("Bayesian Influence Network (Edge Weight = P(Target | Evidence))")
plt.axis('off')
plt.tight_layout()
plt.savefig("bayesian_influence_network.png")
plt.close()